{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 根据89611条新闻数据，其中包括78855条新华社新闻，通过多项式分布实现对于抄袭新华社新闻的检测。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(89611, 7)\n",
      "      id      author                  source  \\\n",
      "0  89617         NaN  快科技@http://www.kkj.cn/   \n",
      "1  89616         NaN  快科技@http://www.kkj.cn/   \n",
      "2  89615         NaN  快科技@http://www.kkj.cn/   \n",
      "3  89614         NaN                     新华社   \n",
      "4  89613  胡淑丽_MN7479                   深圳大件事   \n",
      "\n",
      "                                             content  \\\n",
      "0  此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/...   \n",
      "1  骁龙835作为唯一通过Windows 10桌面平台认证的ARM处理器，高通强调，不会因为只考...   \n",
      "2  此前的一加3T搭载的是3400mAh电池，DashCharge快充规格为5V/4A。\\r\\n...   \n",
      "3    这是6月18日在葡萄牙中部大佩德罗冈地区拍摄的被森林大火烧毁的汽车。新华社记者张立云摄\\r\\n   \n",
      "4  （原标题：44岁女子跑深圳约会网友被拒，暴雨中裸身奔走……）\\r\\n@深圳交警微博称：昨日清...   \n",
      "\n",
      "                                             feature  \\\n",
      "0  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"37\"...   \n",
      "1  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"15\"...   \n",
      "2  {\"type\":\"科技\",\"site\":\"cnbeta\",\"commentNum\":\"18\"...   \n",
      "3  {\"type\":\"国际新闻\",\"site\":\"环球\",\"commentNum\":\"0\",\"j...   \n",
      "4  {\"type\":\"新闻\",\"site\":\"网易热门\",\"commentNum\":\"978\",...   \n",
      "\n",
      "                           title  \\\n",
      "0           小米MIUI 9首批机型曝光：共计15款   \n",
      "1     骁龙835在Windows 10上的性能表现有望改善   \n",
      "2      一加手机5细节曝光：3300mAh、充半小时用1天   \n",
      "3  葡森林火灾造成至少62人死亡 政府宣布进入紧急状态（组图）   \n",
      "4       44岁女子约网友被拒暴雨中裸奔 交警为其披衣相随   \n",
      "\n",
      "                                                 url  \n",
      "0     http://www.cnbeta.com/articles/tech/623597.htm  \n",
      "1     http://www.cnbeta.com/articles/tech/623599.htm  \n",
      "2     http://www.cnbeta.com/articles/tech/623601.htm  \n",
      "3  http://world.huanqiu.com/hot/2017-06/10866126....  \n",
      "4  http://news.163.com/17/0618/00/CN617P3Q0001875...  \n",
      "         id author     source content  \\\n",
      "100   89517    NaN  中国证券报?中证网     NaN   \n",
      "103   89514    NaN  中国证券报?中证网     NaN   \n",
      "997   88620    NaN        央广网     NaN   \n",
      "1273  88344    NaN        央广网     NaN   \n",
      "1282  88335    NaN        央广网     NaN   \n",
      "\n",
      "                                                feature  \\\n",
      "100   {\"type\":\"公司\",\"site\":\"中证网\",\"commentNum\":\"0\",\"jo...   \n",
      "103   {\"type\":\"公司\",\"site\":\"中证网\",\"commentNum\":\"0\",\"jo...   \n",
      "997   {\"type\":\"时事要闻\",\"site\":\"参考消息\",\"commentNum\":\"0\",...   \n",
      "1273  {\"type\":\"IT业界\",\"site\":\"参考消息\",\"commentNum\":\"0\",...   \n",
      "1282  {\"type\":\"IT业界\",\"site\":\"参考消息\",\"commentNum\":\"0\",...   \n",
      "\n",
      "                                 title  \\\n",
      "100       天和防务股东未来6个月内计划减持不超过480万股公司股份   \n",
      "103                    晶盛机电调整限制性股票回购价格   \n",
      "997              [主播不在家]第二季：主播陈亮体验垃圾清运   \n",
      "1273                LKK洛可可：想象力经济时代或已到来   \n",
      "1282  CES2017：京东发布两款叮咚智能音箱新品 开放Alpha平台   \n",
      "\n",
      "                                                    url  \n",
      "100   http://www.cs.com.cn/ssgs/gsxw/201706/t2017062...  \n",
      "103   http://www.cs.com.cn/ssgs/gsxw/201706/t2017062...  \n",
      "997   http://www.cankaoxiaoxi.com/china/20170623/214...  \n",
      "1273  http://www.cankaoxiaoxi.com/science/20170610/2...  \n",
      "1282  http://www.cankaoxiaoxi.com/science/20170610/2...  \n",
      "(87054, 7)\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import jieba\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.model_selection import train_test_split, cross_validate\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import pickle\n",
    "from tqdm import tqdm\n",
    "from pprint import pprint\n",
    "import os\n",
    "\n",
    "# 加载停用词\n",
    "with open('chinese_stopwords.txt','r', encoding='utf-8') as file:\n",
    "    stopwords=[i[:-1] for i in file.readlines()]\n",
    "#stopwords = [line.strip() for line in open('chinsesstoptxt.txt',encoding='UTF-8').readlines()]\n",
    "\n",
    "# 数据加载\n",
    "news = pd.read_csv('sqlResult.csv',encoding='gb18030')\n",
    "print(news.shape)\n",
    "print(news.head(5))\n",
    "\n",
    "# 处理缺失值\n",
    "print(news[news.content.isnull()].head(5))\n",
    "news=news.dropna(subset=['content'])\n",
    "print(news.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "此外，自本周（6月12日）起，除小米手机6等15款机型外，其余机型已暂停更新发布（含开发版/体验版内测，稳定版暂不受影响），以确保工程师可以集中全部精力进行系统优化工作。有人猜测这也是将精力主要用到MIUI 9的研发之中。\r\n",
      "MIUI 8去年5月发布，距今已有一年有余，也是时候更新换代了。\r\n",
      "当然，关于MIUI 9的确切信息，我们还是等待官方消息。\r\n",
      "\n",
      "此外 本周 除 小米 手机 款 机型 外 机型 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 确保 工程师 集中 全部 精力 进行 系统优化 工作 有人 猜测 精力 主要 用到 MIUI9 研发 之中 \r",
      " MIUI8 去年 发布 距今已有 一年 有余 更新换代 \r",
      " 当然 MIUI9 确切 信息 等待 官方消息\n"
     ]
    }
   ],
   "source": [
    "# 分词并去无效词\n",
    "def split_text(text):\n",
    "    #return ' '.join([w for w in list(jieba.cut(re.sub('\\s|[%s]' % (punctuation),'',text))) if w not in stopwords])\n",
    "    text = text.replace(' ', '')\n",
    "    text = text.replace('\\n', '')\n",
    "    text2 = jieba.cut(text.strip())\n",
    "    result = ' '.join([w for w in text2 if w not in stopwords])\n",
    "    return result\n",
    "\n",
    "print(news.iloc[0].content)\n",
    "print(split_text(news.iloc[0].content))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "此外 本周 除 小米 手机 款 机型 外 机型 暂停 更新 发布 含 开发 版 体验版 内测 稳定版 暂不受 影响 确保 工程师 集中 全部 精力 进行 系统优化 工作 有人 猜测 精力 主要 用到 MIUI9 研发 之中 \r",
      " MIUI8 去年 发布 距今已有 一年 有余 更新换代 \r",
      " 当然 MIUI9 确切 信息 等待 官方消息\n",
      "87054\n",
      "骁龙 835 唯一 Windows10 桌面 平台 认证 ARM 处理器 高通 强调 不会 只 考虑 性能 屏蔽掉 小 核心 相反 正 联手 微软 找到 一种 适合 桌面 平台 兼顾 性能 功耗 完美 方案 \r",
      " 报道 微软 已经 拿到 一些 源码 Windows10 更好 理解 big little 架构 \r",
      " 资料 显示 骁龙 835 一款 集成 CPU GPU 基带 蓝牙 Wi Fi SoC 传统 Wintel 方案 节省 至少 30% PCB 空间 \r",
      " 按计划 今年 Q4 华硕 惠普 联想 首发 骁龙 835Win10 电脑 预计 均 二合一 形态 产品 \r",
      " 当然 高通 骁龙 未来 也许 见到 三星 Exynos 联发科 华为 麒麟 小米 澎湃 进入 Windows10 桌面 平台\n"
     ]
    }
   ],
   "source": [
    "#收集并处理content列\n",
    "if not os.path.exists(\"corpus.pkl\"):\n",
    "    # 对所有文本进行分词\n",
    "    corpus=list(map(split_text,[str(i) for i in news.content]))\n",
    "    print(corpus[0])\n",
    "    print(len(corpus))\n",
    "    print(corpus[1])\n",
    "    # 保存到文件，方便下次调用\n",
    "    with open('corpus.pkl','wb') as file:\n",
    "        pickle.dump(corpus, file)\n",
    "else:\n",
    "    # 调用上次处理的结果\n",
    "    with open('corpus.pkl','rb') as file:\n",
    "        corpus = pickle.load(file)\n",
    "        print(corpus[0])\n",
    "        print(len(corpus))\n",
    "        print(corpus[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(87054, 884)\n",
      "  (0, 590)\t0.237736608488\n",
      "  (0, 461)\t0.287807002707\n",
      "  (0, 271)\t0.431560057041\n",
      "  (0, 416)\t0.263569509136\n",
      "  (0, 432)\t0.217387276604\n",
      "  (0, 669)\t0.267150299769\n",
      "  (0, 860)\t0.267413014466\n",
      "  (0, 184)\t0.246785583728\n",
      "  (0, 822)\t0.150818545995\n",
      "  (0, 385)\t0.185018693496\n",
      "  (0, 103)\t0.201078563673\n",
      "  (0, 667)\t0.283109136374\n",
      "  (0, 263)\t0.229198530311\n",
      "  (0, 44)\t0.262255632828\n",
      "  (0, 174)\t0.234573238002\n"
     ]
    }
   ],
   "source": [
    "# 得到corpus的TF-IDF矩阵\n",
    "countvectorizer = CountVectorizer(encoding='gb18030',min_df=0.015)\n",
    "tfidftransformer = TfidfTransformer()\n",
    "\n",
    "countvector = countvectorizer.fit_transform(corpus)\n",
    "print(countvector.shape)\n",
    "\n",
    "tfidf = tfidftransformer.fit_transform(countvector)\n",
    "print(tfidf[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy: 0.889726997741\n",
      "precison: 0.962384953982\n",
      "recall: 0.914101102242\n",
      "f1_score: 0.937621832359\n"
     ]
    }
   ],
   "source": [
    "# 分类是否为新华社新闻\n",
    "label=list(map(lambda source: 1 if '新华' in str(source) else 0,news.source))\n",
    "#print(label)\n",
    "\n",
    "# 数据集切分\n",
    "X_train, X_test, y_train, y_test = train_test_split(tfidf.toarray(), label, test_size = 0.3, random_state=42)\n",
    "clf = MultinomialNB()\n",
    "clf.fit(X=X_train, y=y_train)\n",
    "\n",
    "\"\"\"\n",
    "# 进行CV=3折交叉验证\n",
    "scores=cross_validate(clf, X_train, y_train, scoring=('accuracy','precision','recall','f1'), cv=3, return_train_score=True)\n",
    "pprint(scores)\n",
    "\"\"\"\n",
    "y_predict = clf.predict(X_test)\n",
    "def show_test_reslt(y_true,y_pred):\n",
    "    print('accuracy:',accuracy_score(y_true,y_pred))\n",
    "    print('precison:',precision_score(y_true,y_pred))\n",
    "    print('recall:',recall_score(y_true,y_pred))\n",
    "    print('f1_score:',f1_score(y_true,y_pred))\n",
    "\n",
    "show_test_reslt(y_test, y_predict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "可能为Copy的新闻索引： Int64Index([   4,   24,   28,   30,   32,   35,   37,   38,   43,   45,\n",
      "            ...\n",
      "            8538, 8539, 8543, 8544, 8546, 8548, 8549, 8551, 8552, 8555],\n",
      "           dtype='int64', length=2818)\n"
     ]
    }
   ],
   "source": [
    "# 使用模型检测抄袭新闻\n",
    "prediction = clf.predict(tfidf.toarray())\n",
    "labels = np.array(label)\n",
    "# compare_news_index中有两列：prediction为预测，labels为真实值\n",
    "compare_news_index = pd.DataFrame({'prediction':prediction,'labels':labels})\n",
    "# copy_news_index：可能是Copy的新闻（即找到预测为1，但是实际不是“新华社”）\n",
    "copy_news_index=compare_news_index[(compare_news_index['prediction'] == 1) & (compare_news_index['labels'] == 0)].index\n",
    "# 实际为新华社的新闻\n",
    "xinhuashe_news_index=compare_news_index[(compare_news_index['labels'] == 1)].index\n",
    "print('可能为Copy的新闻索引：', copy_news_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(87054,)\n",
      "[16 16 16 ..., 16 16 23]\n"
     ]
    }
   ],
   "source": [
    "#文章聚类后保存列表\n",
    "if not os.path.exists(\"label.pkl\"):\n",
    "    # 使用k-means对文章进行聚类\n",
    "    from sklearn.preprocessing import Normalizer\n",
    "    from sklearn.cluster import KMeans\n",
    "    normalizer = Normalizer()\n",
    "    scaled_array = normalizer.fit_transform(tfidf.toarray())\n",
    "\n",
    "    # 使用K-Means, 对全量文档进行聚类\n",
    "    kmeans = KMeans(n_clusters=25,random_state=42,n_jobs=-1)\n",
    "    k_labels = kmeans.fit_predict(scaled_array)\n",
    "    # 保存到文件，方便下次调用\n",
    "    with open('label.pkl','wb') as file:\n",
    "        pickle.dump(k_labels, file)\n",
    "    print(k_labels.shape)\n",
    "    print(k_labels[0])\n",
    "else:\n",
    "    # 调用上次处理的结果\n",
    "    with open('label.pkl','rb') as file:\n",
    "        k_labels = pickle.load(file)\n",
    "        print(k_labels.shape)\n",
    "        print(k_labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16\n"
     ]
    }
   ],
   "source": [
    "#将类别对应编号保存列表\n",
    "if not os.path.exists(\"id_class.pkl\"):\n",
    "    # 创建id_class\n",
    "    id_class = {index:class_ for index, class_ in enumerate(k_labels)}\n",
    "    # 保存到文件，方便下次调用\n",
    "    with open('id_class.pkl','wb') as file:\n",
    "        pickle.dump(id_class, file)\n",
    "else:\n",
    "    # 调用上次处理的结果\n",
    "    with open('id_class.pkl','rb') as file:\n",
    "        id_class = pickle.load(file)\n",
    "        print(id_class[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "#把属于新华社的文章列表按类别收纳到25个字典中\n",
    "if not os.path.exists(\"class_id.pkl\"):\n",
    "    from collections import defaultdict\n",
    "    # 创建你class_id字段，key为classId,value为文档index\n",
    "    class_id = defaultdict(set)\n",
    "    for index,class_ in id_class.items():\n",
    "        # 只统计新华社发布的class_id\n",
    "        if index in xinhuashe_news_index.tolist():\n",
    "            class_id[class_].add(index)\n",
    "    # 保存到文件，方便下次调用\n",
    "    with open('class_id.pkl','wb') as file:\n",
    "        pickle.dump(class_id, file)\n",
    "else:\n",
    "    # 调用上次处理的结果\n",
    "    with open('class_id.pkl','rb') as file:\n",
    "        class_id = pickle.load(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 12324\n",
      "1 10360\n",
      "2 6919\n",
      "3 3789\n",
      "4 3506\n",
      "5 1851\n",
      "6 1653\n",
      "7 1762\n",
      "8 728\n",
      "9 2280\n",
      "10 949\n",
      "11 1732\n",
      "12 1603\n",
      "13 1474\n",
      "14 1564\n",
      "15 9749\n",
      "16 1843\n",
      "17 874\n",
      "18 2124\n",
      "19 2008\n",
      "20 1358\n",
      "21 2844\n",
      "22 2800\n",
      "23 1254\n",
      "24 1507\n"
     ]
    }
   ],
   "source": [
    "# 输出每个类别的 文档个数\n",
    "count=0\n",
    "for k in class_id:\n",
    "    print(count, len(class_id[k]))\n",
    "    count +=1\n",
    "\n",
    "# 查找相似文本（使用聚类结果进行filter）\n",
    "def find_similar_text(cpindex, top=10):\n",
    "    # 只在新华社发布的文章中查找\n",
    "    dist_dict={i:cosine_similarity(tfidf[cpindex],tfidf[i]) for i in class_id[id_class[cpindex]]}\n",
    "    # 从大到小进行排序\n",
    "    return sorted(dist_dict.items(),key=lambda x:x[1][0], reverse=True)[:top]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "是否在新华社 False\n",
      "是否在copy_news True\n",
      "[(54858, array([[ 0.4790132]])), (54976, array([[ 0.47379451]])), (18495, array([[ 0.46644004]])), (75084, array([[ 0.46600637]])), (82080, array([[ 0.45851198]])), (52291, array([[ 0.45596554]])), (76849, array([[ 0.45554416]])), (86111, array([[ 0.43596508]])), (24206, array([[ 0.43159321]])), (86001, array([[ 0.42976636]]))]\n",
      "怀疑抄袭:\n",
      " 中超联赛第13轮比赛已经全部结束，这轮比赛中最让人觉得结果有些出乎意料的，莫过于卡佩罗执教江苏苏宁的主场首秀却以失败告终，在其他球队换帅如换刀的背景下，苏宁却持续陷入低迷，13轮过后只积8分，排名倒数第2，近3年1铁律预示苏宁想保级除非创造奇迹。\n",
      "就在去年也就是2016赛季，由于中超第21轮被提前，因此中超第21轮比赛结束后也就相当于如今的13轮比赛过后，当时杭州绿城只积9分，山东鲁能由于少赛一场，也积9分，同处降级区，但是鲁能在第13场比赛战平上港，因此13场过后积分应为10分，最终杭州绿城降级，鲁能惊险保级。\n",
      "再往前推，2015赛季，中超13轮比赛过后，贵州人和与上海申鑫排在倒数第1和第2位，贵州人和积10分，而上海申鑫只有4分，最终这2支球队全部降级，也就是说当时第13轮比赛的降级区球队成为了最终联赛结束后真正降级的2支球队。\n",
      "然后回到2014赛季，前13轮比赛过后，身处降级区的是哈尔滨毅腾和河南建业两支球队，河南建业已经有10分入账，而哈尔滨毅腾在少赛一场的情况下只积5分，即使算上哈尔滨毅腾第13场比赛能赢，最多也是8分，最终哈尔滨毅腾降级。\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免。如今江苏苏宁在13轮过后竟然只获得了1场胜利，积分只有8分，接下来的比赛除非创造奇迹打破这三年来的铁律，才可能保级，苏宁从现在开始真的是为保级而战了！（老邱中超球评）\n",
      "\n",
      "相似原文:\n",
      " 　　新华社南京５月１０日电　亚冠小组赛综述：中超创造历史\\n　　新华社记者　王恒志\\n　　２０１７赛季亚冠小组赛１０日尘埃落定，中超球队在这个赛季的小组赛中表现出色，历史上首次有三支球队杀进亚冠１６强，同时江苏苏宁队创造提前两轮出线和小组赛积分最高（１５分）历史、广州恒大队则成为首支小组赛不败的中超球队。而淘汰赛尚未开打，另一项历史已经出现：苏宁将和上海上港队送出亚冠赛场上的首次“中国德比”。\\n　　拥有众多大牌外教和外援的中超这个赛季在亚冠赛场上可谓威风八面，江苏苏宁队和上海上港队都早早提前锁定小组出线名额，苏宁更是创纪录地以四连胜提前两轮锁定小组头名。不过，从比赛内容来说，上港的场上表现毫无疑问更为出色，奥斯卡、胡尔克等球员的火热状态让他们一路高歌猛进，苏宁虽然连战连捷，但人员不整的他们在有些场次场面上处于下风，只是凭借拉米雷斯和特谢拉的“灵光一闪”才一路高奏凯歌。\\n　　不过，苏宁在最后一轮仍然在创造历史，以全华班出战的他们客场１:０小胜澳大利亚的阿德莱德联队，１５分的积分创造了２００９年亚冠改制后中超球队小组赛最高纪录，中超球队全华班取胜也是头一遭。这个赛季的苏宁堪称“亚冠球队”，尽管联赛八轮不胜，但在亚冠赛场他们队史首次出线，同时还缔造了一系列中超纪录。\\n　　令人稍显意外的是广州恒大，他们直到最后一轮还要确保不输球才能晋级。好在恒大底蕴犹在，最终以小组第二的身份出线。但同时，恒大仍然创造了历史，２平４负的不败战绩让他们成为改制后首支小组赛不败的中超球队。\\n　　更重要的是，这是首次有三支中超球队打进亚冠１６强，此前中超的最好战绩是两支球队出线，本赛季上海申花队在资格赛落败，只有三支球队参加小组赛，结果他们全部晋级，给了中国球迷一个大惊喜。\\n　　与之相对的是，本赛季澳大利亚和韩国球队呈“溃败”之势，澳超全军覆没，韩国Ｋ联赛也仅存济州联队一支“独苗”。不过，日本球队表现更为强势，他们四支球队出战三支出线，而且浦和红钻、鹿岛鹿角和川崎前锋三队均是以小组头名晋级。此外泰国的蒙通联队获得一张东亚区１６强席位。\\n　　令很多中国球迷惋惜的是，虽然有三支中超球队晋级，但１６强战就将上演“中国德比”。上港在１０日的小组赛最后一轮２:３落败，在浦和红钻输球的情况下，没能反超对手登上积分榜首，获得小组第二的他们将在淘汰赛阶段迎战江苏苏宁队。虽然早早同室操戈，但这也将创造历史：这是中超球队在亚冠上的首度“德比”。恒大的对手则是鹿岛鹿角队，虽然小组赛表现不尽如人意，但贵为两届亚冠冠军的恒大后劲不容小觑。（完）\\n\n",
      "编辑距离: 931\n"
     ]
    }
   ],
   "source": [
    "import editdistance\n",
    "# 指定某篇文章的相似度\n",
    "#print(copy_news_index)\n",
    "cpindex = 28 # 在copy_news_index\n",
    "print('是否在新华社', cpindex in xinhuashe_news_index)\n",
    "print('是否在copy_news', cpindex in copy_news_index)\n",
    "#print('28', 28 in xinhuashe_news_index)\n",
    "\n",
    "#print(cpindex)\n",
    "similar_list = find_similar_text(cpindex)\n",
    "print(similar_list)\n",
    "print('怀疑抄袭:\\n', news.iloc[cpindex].content)\n",
    "# 找一篇相似的原文\n",
    "similar2 = similar_list[0][0]\n",
    "print('相似原文:\\n', news.iloc[similar2].content)\n",
    "# 求任意两篇文章的编辑距离 \n",
    "print('编辑距离:',editdistance.eval(corpus[cpindex], corpus[similar2]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "怀疑抄袭句:\r\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免\n",
      "相似原句:\\n　　令人稍显意外的是广州恒大，他们直到最后一轮还要确保不输球才能晋级\n",
      " 编辑距离:61\n",
      "\n",
      "怀疑抄袭句:\r\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免\n",
      "相似原句:\\n　　令很多中国球迷惋惜的是，虽然有三支中超球队晋级，但１６强战就将上演“中国德比”\n",
      " 编辑距离:62\n",
      "\n",
      "怀疑抄袭句:\r\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免\n",
      "相似原句:而淘汰赛尚未开打，另一项历史已经出现：苏宁将和上海上港队送出亚冠赛场上的首次“中国德比”\n",
      " 编辑距离:63\n",
      "\n",
      "怀疑抄袭句:\r\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免\n",
      "相似原句:这个赛季的苏宁堪称“亚冠球队”，尽管联赛八轮不胜，但在亚冠赛场他们队史首次出线，同时还缔造了一系列中超纪录\n",
      " 编辑距离:63\n",
      "\n",
      "怀疑抄袭句:\r\n",
      "从这3年的中超13轮比赛过后的积分榜来看，会发现一个铁律，那就是所有在13轮比赛结束后，还低于10分的球队，最终全部降级，无一幸免\n",
      "相似原句:虽然早早同室操戈，但这也将创造历史：这是中超球队在亚冠上的首度“德比”\n",
      " 编辑距离:63\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def find_similar_sentence(candidate, raw):\n",
    "    similist = []\n",
    "    cl = candidate.strip().split('。')\n",
    "    ra = raw.strip().split('。')\n",
    "    for c in cl:\n",
    "        for r in ra:\n",
    "            similist.append([c,r,editdistance.eval(c,r)])\n",
    "    # 最相似的5个句子\n",
    "    sort=sorted(similist,key=lambda x:x[2])[:5]\n",
    "    for c,r,ed in sort:\n",
    "        if c!='' and r!='':\n",
    "            print('怀疑抄袭句:{0}\\n相似原句:{1}\\n 编辑距离:{2}\\n'.format(c,r,ed))\n",
    "# 查找copy文章 和第一相似的原文的比对\n",
    "find_similar_sentence(news.iloc[cpindex].content, news.iloc[similar2].content)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
